Overview
There is mainly two parts to this story:
A simple easy model of the key relationship
A medium complexity smoothing analysis
The two data set used in this analysis are the Madison case and waste water concentration data.
## Date Site Cases N1 N1Error
## 435 2021-06-19 Madison 3 NA NA
## 436 2021-06-20 Madison 8 49443 8812
## 437 2021-06-21 Madison 1 90447 20429
## 438 2021-06-22 Madison 3 44587 12427
## 439 2021-06-23 Madison 4 14469 9155
## 440 2021-06-24 Madison NA 28023 7724
A simple display of the data shows the core components of this story. First that both data sets are extremely noisy. And that there is a hint of a relationship between the two signals.
FirstImpression <- FullDF%>%
NoNa("N1","Cases")%>%#Removing outliers
ggplot(aes(x=Date))+#Data depends on time
geom_line(aes(y=MinMaxNormalization(N1), color="N1",info=N1))+#compares N1 to Cases
geom_line(aes(y=MinMaxNormalization(Cases), color="Cases",info=Cases))+
labs(y="variable min max normalized")+
ColorRule
ggplotly(FirstImpression,tooltip=c("info","Date"))
Cross correlation and Granger Causality are key component to this analysis. Cross correlation looks at the correlation at a range of time shifts and Granger analysis preform a test for predictive power. We find that there is to much noise to find significance.
TestDF1 <- FullDF%>%
FillNA("N1","Cases")%>%#filling in missing data in range with previous values
NoNa("N1","Cases")#removing rows from before both series started#filling in missing data in range with previous values
Cases <- TestDF1$Cases
N1 <- TestDF1$N1
ccf(Cases,N1,na.action=na.pass)
grangertest(Cases, N1, order = 1)
## Granger causality test
##
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(Cases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
## Res.Df Df F Pr(>F)
## 1 279
## 2 280 -1 2.623 0.1065
grangertest(N1,Cases, order = 1)
## Granger causality test
##
## Model 1: Cases ~ Lags(Cases, 1:1) + Lags(N1, 1:1)
## Model 2: Cases ~ Lags(Cases, 1:1)
## Res.Df Df F Pr(>F)
## 1 279
## 2 280 -1 0.1124 0.7377
IntermediateOutlierGraphic <- TRUE #
DaySmoothed=21#Very wide smoothing to find where the data strong deviate from trend
FullDF2 <- FullDF%>%#Remove older data that clearly has no relationship to Cases
mutate(N1 = ifelse(Date < mdy("11/20/2020"),NA,N1))
FullDF3 <- FullDF2%>%
mutate(SmoothN1=rollapply(data = N1, width = DaySmoothed, FUN = median,
na.r = TRUE,fill=NA),#Finding very smooth version of the data with no outliers
SmoothN1=ifelse(is.na(SmoothN1),N1,SmoothN1),#Fixing issue where rollapply fills NA on right border
LargeError=N1>5*SmoothN1,#Calculating error Limits
N1=ifelse(LargeError,SmoothN1,N1))%>%#replacing data points that variance is to large
select(-SmoothN1,-LargeError)#Removing unneeded calculated columns
if(IntermediateOutlierGraphic){
OutlierGraphic <- FullDF%>%
mutate(SmoothN1=rollapply(data = N1, width = DaySmoothed, FUN = median,
na.r = TRUE,fill=NA),#creating smooth data
SmoothN1=ifelse(is.na(SmoothN1),N1,SmoothN1))%>%#Fixing issue where rollapply fills NA on right border)%>%
mutate(Outliers=Date < mdy("11/20/2020")|N1>5*SmoothN1)%>%
NoNa("N1","Cases")%>%#Removing outliers
ggplot(aes(x=Date))+#Data depends on time
geom_point(aes(y=MinMaxNormalization(Cases), color="Cases",info=Cases))+
geom_point(aes(y=MinMaxNormalization(N1), color="N1",shape=Outliers,info=N1))+#compares N1 to Cases
labs(y="variable min max normalized")
ggplotly(OutlierGraphic,tooltip=c("info","Date"))
}
Ref <- FullDF3%>%
NoNa("N1","Cases")%>%#Removing outliers
ggplot(aes(x=Date))+#Data depends on time
geom_line(aes(y=MinMaxNormalization(N1), color="N1", info = N1))+#compares N1 to Cases
geom_line(aes(y=MinMaxNormalization(Cases), color="Cases",info = Cases))+
labs(y="variable min max normalized")+
ColorRule
ggplotly(Ref,tooltip=c("info","Date"))
TestDF2 <- FullDF3%>%
FillNA("N1","Cases")%>%#filling in missing data in range with previous values
NoNa("N1","Cases")#removing rows from before both series started
Cases <- TestDF2$Cases
N1 <- TestDF2$N1
#library(tseries) Tests for stationarity. potential use is obvious
# kpss.test(Cases)
# adf.test(Cases)
# kpss.test(N1)
# adf.test(N1)
ccf(Cases,N1,na.action=na.pass)
grangertest(Cases, N1, order = 1)
grangertest(N1,Cases, order = 1)
A key component to this relationship is that the relationship between N1 and Case involves a gamma distribution modeling both the time between catching Covid-19 and getting a test and the concentration of the shedded particles. We found a gamma distribution with mean 11.73 days and a standard deviation of 7.68 match’s other research and gives good results.
SLDWidth <- 21
scale <- 5.028338
shape <- 2.332779 #These parameters are equivalent to the mean and sd above
weights <- dgamma(1:SLDWidth, scale = scale, shape = shape)
plot(weights,
main=paste("Gamma Distribution with mean = 11.73 days,and SD = 7.68"),
ylab = "Weight",
xlab = "Lag")
SLDSmoothedDF <- FullDF3%>%
mutate(
SLDCases = c(rep(NA,SLDWidth-1),#elimination of starting values not relevant as we have a 50+ day buffer of case data
rollapply(Cases,width=SLDWidth,FUN=weighted.mean,
w=weights,
na.rm = FALSE)))#no missing data to remove
SLDPlot = SLDSmoothedDF%>%
NoNa("N1","SLDCases")%>%#same plot as earlier but with the SLD smoothing
ggplot(aes(x=Date))+
geom_line(aes(y=MinMaxNormalization(SLDCases), color="SLDCases",info = SLDCases))+
geom_line(aes(y=MinMaxNormalization(N1), color="N1",info = N1))+
facet_wrap(~Site)+
labs(y="variable min max normalized")+
ColorRule
ggplotly(SLDPlot,tooltip=c("info","Date"))
The SLD Cause a signfigent improvement in the relationship now showing a statisticly signifigent test result
TestDF3 <- SLDSmoothedDF%>%
FillNA("N1","SLDCases")%>%#filling in missing data in range with previous values
NoNa("N1","SLDCases")#removing rows from before both series started
SLDCases <- TestDF3$SLDCases
N1 <- TestDF3$N1
ccf(SLDCases,N1,na.action=na.pass)
grangertest(SLDCases, N1, order = 1)
## Granger causality test
##
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(SLDCases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
## Res.Df Df F Pr(>F)
## 1 279
## 2 280 -1 8.3701 0.004115 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
grangertest(N1,SLDCases, order = 1)
## Granger causality test
##
## Model 1: SLDCases ~ Lags(SLDCases, 1:1) + Lags(N1, 1:1)
## Model 2: SLDCases ~ Lags(SLDCases, 1:1)
## Res.Df Df F Pr(>F)
## 1 279
## 2 280 -1 16.616 5.974e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
To isolate this relationship we used a primitive binning relationship. This clarifies the relationship we see hints of in the previous graphic and masks the noise in the data.
StartDate <- 1 #Where the binning starts
DaySmoothing <- 14 #The size of the bins
Lag <- 7 #The offset between Cases and N1
BinDF <- SLDSmoothedDF%>%
select(Date, Cases, N1)%>%
mutate(MovedCases = data.table::shift(Cases, Lag),#moving SLD lag days backwards
Week=(as.numeric(Date)+StartDate)%/%DaySmoothing)%>%#putting variables into bins via integer division
group_by(Week)%>%
#filter(Week>2670)%>%
summarise(BinnedCases=median(MovedCases, na.rm=TRUE), BinnedN1=(median((N1), na.rm=TRUE)), Date = mean(Date,na.rm = TRUE))#summarize data within bins.
BinedGraph <- BinDF%>%
NoNa("BinnedN1","BinnedCases")%>%#Remove NA
ggplot(aes(x=Date))+
geom_line(aes(y=MinMaxNormalization(BinnedN1), color="N1" , info = BinnedN1))+
geom_line(aes(y=MinMaxNormalization(BinnedCases), color="Cases", info = BinnedCases))+
labs(y="Binned variable min max normalized")+
ColorRule
ggplotly(BinedGraph,tooltip=c("info","Date"))
BinedGraph
BinedCorrGraph <- BinDF%>%
ggplot()+
geom_point(aes(x=BinnedCases, y=BinnedN1, info = Date))
ggplotly(BinedCorrGraph,tooltip=c("x","y","info"))
cor(BinDF$BinnedN1, BinDF$BinnedCases, use="pairwise.complete.obs")
## [1] 0.3365265
summary(lm(BinnedCases~BinnedN1, data=BinDF))
##
## Call:
## lm(formula = BinnedCases ~ BinnedN1, data = BinDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89.54 -55.36 -31.51 40.55 240.47
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.637e+01 3.161e+01 1.783 0.0897 .
## BinnedN1 3.348e-04 2.095e-04 1.598 0.1257
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 81.12 on 20 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.1133, Adjusted R-squared: 0.06891
## F-statistic: 2.554 on 1 and 20 DF, p-value: 0.1257
To generate this relationship without reducing the amount of data we rely on a loess smoothing of the data. The loess smoothing is a way of generating smooth curves from noisy data. The displayed plot shows the visual power of this smoothing. We see a relationship in the big patterns but also multiple sub patterns match. We see in general that smoothed N1 both lags and leads the case data.
SLDSmoothedDF$loessN1 <- loessFit(y=(SLDSmoothedDF$N1),
x=SLDSmoothedDF$Date, #create loess fit of the data
span=.2, #span of .2 seems to give the best result, not rigorously chosen
iterations=2)$fitted#2 iterations remove some bad patterns
SLDLoessGraphic <- SLDSmoothedDF%>%
NoNa("loessN1","SLDCases")%>%
ggplot(aes(x=Date))+
geom_line(aes(y=MinMaxNormalization(loessN1), color="loessN1" , info = loessN1))+
geom_line(aes(y=MinMaxNormalization(SLDCases), color="SLDCases" , info = SLDCases))+
facet_wrap(~Site)+
labs(y="variable min max normalized")+
ColorRule
ggplotly(SLDLoessGraphic,tooltip=c("info","Date"))
The loess smoothing roughly doubled the correlation at all time lags. The two granger tests p value also got signifigently smaller
TestDF4 <- SLDSmoothedDF%>%
FillNA("loessN1","SLDCases")%>%#filling in missing data in range with previous values
NoNa("loessN1","SLDCases")#removing rows from before both series started
SLDCases <- TestDF4$SLDCases
N1 <- TestDF4$loessN1
ccf(SLDCases,N1,na.action=na.pass)
grangertest(SLDCases, N1, order = 1)
## Granger causality test
##
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(SLDCases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
## Res.Df Df F Pr(>F)
## 1 279
## 2 280 -1 89.183 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
grangertest(N1,SLDCases, order = 1)
## Granger causality test
##
## Model 1: SLDCases ~ Lags(SLDCases, 1:1) + Lags(N1, 1:1)
## Model 2: SLDCases ~ Lags(SLDCases, 1:1)
## Res.Df Df F Pr(>F)
## 1 279
## 2 280 -1 69.758 3.177e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1